from pandas import DataFrame, Series
from dateutil.parser import parse
import pandas as pd
import numpy as np
import statsmodels.api as sm
from hcquant.industry import get_industry

def zscore(df):
    """
    Standardize ``df`` by subtracting its mean and dividing by its
    population standard deviation (``ddof=0``).

    Args:
        df: pandas DataFrame or Series. ``mean`` and ``std`` are called with
            their default axis, so a DataFrame is standardized per column.

    Returns:
        An object shaped like ``df`` holding the standardized values.
    """
    center = df.mean()
    spread = df.std(ddof=0)
    centered = df - center
    return centered / spread

# df m * n
def winsorize(df, winsorize_time=2, level=3):
    """
    Clamp outliers using the median absolute deviation (MAD).

    On every pass the median and the median of the absolute deviations from
    that median are computed (per column for a DataFrame, over all values
    for a Series). Values at or beyond ``median +/- level * MAD`` are then
    replaced by the corresponding bound. Missing values are left untouched
    because they never compare as out of range.

    Args:
        df: pandas DataFrame (rows are stocks, columns are factors) or a
            pandas Series.
        winsorize_time: number of clamping passes to run.
        level: how many MADs away from the median the bounds are placed.

    Returns:
        The clamped data, of the same kind as ``df``. When
        ``winsorize_time`` is 0 the input object is returned as is.

    Raises:
        TypeError: if ``df`` is neither a DataFrame nor a Series.
    """
    def _clamp_once(data, level=level):
        center = data.median(axis=0)  # one value per column
        mad = (data - center).apply(abs).median(axis=0)  # one value per column

        spread = level * mad
        lower = center - spread
        upper = center + spread

        # A DataFrame needs axis=1 so the per-column bounds are lined up
        # with the columns; for a Series the bounds are plain scalars.
        if isinstance(data, DataFrame):
            align = {'axis': 1}
        elif isinstance(data, Series):
            align = {}
        else:
            raise TypeError('input must be Dataframe or Series')

        capped = data.mask(data >= upper, upper, **align)
        return capped.mask(capped <= lower, lower, **align)

    result = df
    for _ in range(winsorize_time):
        result = _clamp_once(result, level=level)
    return result


def neutralize(factor_data, trade_dt, engine, market=False):
    """
    Return the neutralized factor for one cross-section (one trade date).

    The factor is regressed by OLS, with an intercept, on one dummy column
    per industry plus every other column returned by ``get_industry``
    except ``sid`` and ``indname``. The regression residual is returned.

    Args:
        factor_data: DataFrame with exactly two columns; the first is the
            stock id (``sid``) and the second is the factor value.
        trade_dt: trade date, either a string understood by
            ``dateutil.parser.parse`` or an object with ``strftime``.
        engine: passed through unchanged to ``get_industry``.
        market: when True, the ``mkt_value`` column is left out of the
            regressors, i.e. no market-cap neutralization is performed.
            NOTE(review): assumes ``get_industry`` returns a ``mkt_value``
            column -- confirm against that helper.

    Returns:
        pandas Series of residuals (factor value minus fitted value),
        indexed by row position in the merged frame. Rows whose factor
        value or exposures are missing (for example a ``sid`` absent from
        the industry table) are excluded from the fit and come back as NaN.
    """
    if isinstance(trade_dt, str):
        trade_dt = parse(trade_dt).strftime('%Y%m%d')
    else:
        trade_dt = trade_dt.strftime('%Y%m%d')

    # Work on a copy so the frame handed back by get_industry is never
    # modified, in case that helper caches or reuses it.
    exposures = get_industry(trade_dt, engine).copy()
    for ind in exposures['indname'].unique():
        exposures[ind] = (exposures['indname'] == ind)

    excluded = {'sid', 'indname'}
    if market:
        excluded.add('mkt_value')
    # Keep the frame's own column order rather than going through a set, so
    # the design matrix is laid out identically on every run.
    fit_list = [col for col in exposures.columns if col not in excluded]

    factor = factor_data.copy()
    factor.columns = ['sid', 'value']
    # NOTE: a left merge keeps every factor row, but the row count can still
    # grow if the industry table holds duplicate sids (e.g. historical data).
    factor = factor.merge(exposures, on='sid', how='left')

    X = sm.add_constant(factor[fit_list].astype(float))
    y = factor['value']
    # missing='drop' fits only on complete rows; subtracting the fitted
    # values then realigns on the index and leaves NaN for dropped rows.
    results = sm.OLS(y, X, missing='drop').fit()
    return y - results.fittedvalues